@article{fu2021violet,
  title={VIOLET: End-to-End Video-Language Transformers with Masked Visual-token Modeling},
  author={Fu, Tsu-Jui and Li, Linjie and Gan, Zhe and Lin, Kevin and Wang, William Yang and Wang, Lijuan and Liu, Zicheng},
  journal={arXiv preprint arXiv:2111.12681},
  year={2021}
}